Pandas is a fast, powerful, flexible and easy to use open source data analysis and manipulation tool, built on top of the Python programming language. At least it desighened to be fast :-)
import pandas as pd
import csv
import dask.dataframe as dd
import datetime
import functools
import gc
import itertools
import sys
from timeit import default_timer as _timer
import pprint
import cProfile
import pstats
import numpy as np
from functools import reduce
import pandas_profiling
calendar = 'data/calendar_summary.csv'
listings = 'data/listings.csv'
listings_s = 'data/listings_summary.csv'
hood = 'data/neighbourhoods.csv'
reviews = 'data/reviews.csv'
reviews_s = 'data/reviews_summary.csv'
path_list = [calendar, listings, listings_s, hood, reviews, reviews_s]
demand = 'data/demand_profile.csv'
output = 'output/profile.html'
Utils
def timeit(_func=None, *, repeat=3, number=1000, file=sys.stdout):
_repeat = functools.partial(itertools.repeat, None)
def wrap(func):
@functools.wraps(func)
def _timeit(*args, **kwargs):
gcold = gc.isenabled()
gc.disable()
try:
trials = []
for _ in _repeat(repeat):
total = 0
for _ in _repeat(number):
start = _timer()
result = func(*args, **kwargs)
end = _timer()
total += end - start
trials.append(total)
best = min(trials) / number
print(
"Function `{}` ran in average"
" of {:0.6f} seconds.".format(func.__name__, best),
end="\n\n",
file=file,
)
finally:
if gcold:
gc.enable()
return result
return _timeit
if _func is None:
return wrap
else:
return wrap(_func)
def mem_usage(pandas_obj):
if isinstance(pandas_obj,pd.DataFrame):
usage_b = pandas_obj.memory_usage(deep=True).sum()
else:
usage_b = pandas_obj.memory_usage(deep=True)
usage_mb = usage_b / 1024 ** 2
return "{:03.2f} MB".format(usage_mb)
def create_test_df():
return pd.DataFrame({
'A' : ['spam', 'eggs', 'spam', 'eggs'] * 6,
'B' : ['alpha', 'beta', 'gamma'] * 8,
'C' : [np.random.choice(pd.date_range(datetime.datetime(2020,1,1),datetime.datetime(2020,1,3))) for i in range(24)],
'D' : np.random.randn(24),
'E' : np.random.randint(2,10,24),
'F' : [np.random.choice(['rand_1', 'rand_2', 'rand_4', 'rand_6']) for i in range(24)],})
def statistic_exploration(df):
print('The data has {} Rows and {} columns'.format(df.shape[0], df.shape[1]))
col_list = ['Null','Unique_Count','Count_all', 'Data_type', 'Missing_%',
'Max/Min','Mean', 'Median',
'Variance', 'Kurtosis', 'Std', 'Skewness']
df_stat = pd.DataFrame(index=df.columns, columns=col_list)
df_stat['Null'] = list([len(df[col][df[col].isnull()]) for i, col in enumerate(df.columns)])
df_stat['Unique_Count'] = list([len(df[col].unique()) for i, col in enumerate(df.columns)])
df_stat['Count_all'] = list([df[col].count() for i, col in enumerate(df.columns)])
df_stat['Data_type'] = list([df[col].dtype for i, col in enumerate(df.columns)])
df_stat['Missing_%'] = list([abs(100 - ((len(df[col].isnull())*100)/df[col].count()))
for i, col in enumerate(df.columns)])
for i,col in enumerate(df.columns):
if 'float' in str(df[col].dtype) or 'int' in str(df[col].dtype):
df_stat.at[col,'Max/Min'] = str(round(df[col].max(),2)) + '/' + str(round(df[col].min(),2))
df_stat.at[col,'Mean'] = df[col].mean()
df_stat.at[col,'Median'] = df[col].median()
df_stat.at[col,'Variance'] = df[col].var()
df_stat.at[col,'Kurtosis'] = df[col].kurt()
df_stat.at[col,'Std'] = df[col].std()
df_stat.at[col,'Skewness'] = df[col].skew()
print(df.dtypes.value_counts())
print('Data sample:')
print(df.sample(1).T)
return (df_stat.fillna('---'))
def profile_dec(func):
"""Decorator for run function profile"""
def wrapper(*args, **kwargs):
profile_filename = func.__name__ + '.prof'
profiler = cProfile.Profile()
result = profiler.runcall(func, *args, **kwargs)
profiler.dump_stats(profile_filename)
p = pstats.Stats(profile_filename)
p.sort_stats('cumtime').print_stats()
return result
return wrapper
Dataset upload
@timeit(repeat=10, number=10)
def c_read(path):
return pd.read_csv(path, engine='c')
@timeit(repeat=10, number=10)
def p_read(path):
return pd.read_csv(path, engine='python')
@timeit(repeat=10, number=10)
def d_read(path):
return pd.read_csv(path)
@timeit(repeat=10, number=10)
def d_read_low_mem(path):
return pd.read_csv(path, low_memory=True)
#Will not crash memory stack
@timeit(repeat=10, number=10)
def chunk_read(path):
chunks = pd.read_csv(path, chunksize=10**5)
data = pd.concat(chunks)
return data
@timeit(repeat=10, number=10)
def dict_read(path):
return csv.DictReader(open(path))
@timeit(repeat=10, number=10)
def dask_read(path):
return dd.read_csv(path, sample=10000000)
def check_all(path):
c_read(path)
p_read(path)
d_read(path)
d_read_low_mem(path)
chunk_read(path)
dict_read(path)
dask_read(path)
#for p in path_list:
# check_all(p)
check_all(listings)
Function `c_read` ran in average of 0.291502 seconds. Function `p_read` ran in average of 0.736047 seconds. Function `d_read` ran in average of 0.271633 seconds. Function `d_read_low_mem` ran in average of 0.193232 seconds. Function `chunk_read` ran in average of 0.201904 seconds. Function `dict_read` ran in average of 0.001178 seconds. Function `dask_read` ran in average of 0.193135 seconds.
The DictReader is a Python class which maps the data read as a dictionary, whose keys, unless specified are the first row of the CSV. All values in the subsequent rows will be dictionary values and can be accessed with the respective dictionary key. However, these values will be imported as strings. This method very useful for data analysis tasks, but it is valuable in certain other cases (working with the JSON format)
listings_df = pd.read_csv(listings, verbose=True)
Tokenization took: 35.01 ms Type conversion took: 45.75 ms Parser memory cleanup took: 0.01 ms
listings_df.info(memory_usage='deep')
<class 'pandas.core.frame.DataFrame'> RangeIndex: 22552 entries, 0 to 22551 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 22552 non-null int64 1 name 22493 non-null object 2 host_id 22552 non-null int64 3 host_name 22526 non-null object 4 neighbourhood_group 22552 non-null object 5 neighbourhood 22552 non-null object 6 latitude 22552 non-null float64 7 longitude 22552 non-null float64 8 room_type 22552 non-null object 9 price 22552 non-null int64 10 minimum_nights 22552 non-null int64 11 number_of_reviews 22552 non-null int64 12 last_review 18644 non-null object 13 reviews_per_month 18638 non-null float64 14 calculated_host_listings_count 22552 non-null int64 15 availability_365 22552 non-null int64 dtypes: float64(3), int64(7), object(6) memory usage: 11.5 MB
se_df = statistic_exploration(listings_df)
se_df
The data has 22552 Rows and 16 columns
int64 7
object 6
float64 3
dtype: int64
Data sample:
21826
id 29022524
name Gorki Penthouse 1
host_id 42671030
host_name Gorki Apartments
neighbourhood_group Mitte
neighbourhood Brunnenstr. Süd
latitude 52.5304
longitude 13.4028
room_type Entire home/apt
price 1000
minimum_nights 1
number_of_reviews 0
last_review NaN
reviews_per_month NaN
calculated_host_listings_count 6
availability_365 199
| Null | Unique_Count | Count_all | Data_type | Missing_% | Max/Min | Mean | Median | Variance | Kurtosis | Std | Skewness | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 22552 | 22552 | int64 | 0.000000 | 29867352/2015 | 1.57156e+07 | 1.68664e+07 | 7.31379e+13 | -1.144 | 8.55207e+06 | -0.179928 |
| name | 59 | 21874 | 22493 | object | 0.262304 | --- | --- | --- | --- | --- | --- | --- |
| host_id | 0 | 19180 | 22552 | int64 | 0.000000 | 224508134/2217 | 5.40335e+07 | 3.12671e+07 | 3.38292e+15 | 0.56104 | 5.81629e+07 | 1.26562 |
| host_name | 26 | 5998 | 22526 | object | 0.115422 | --- | --- | --- | --- | --- | --- | --- |
| neighbourhood_group | 0 | 12 | 22552 | object | 0.000000 | --- | --- | --- | --- | --- | --- | --- |
| neighbourhood | 0 | 136 | 22552 | object | 0.000000 | --- | --- | --- | --- | --- | --- | --- |
| latitude | 0 | 22552 | 22552 | float64 | 0.000000 | 52.65/52.35 | 52.5098 | 52.5091 | 0.000950209 | 0.758927 | 0.0308255 | -0.133515 |
| longitude | 0 | 22552 | 22552 | float64 | 0.000000 | 13.76/13.1 | 13.4061 | 13.4168 | 0.00335979 | 2.96943 | 0.0579637 | -0.25517 |
| room_type | 0 | 3 | 22552 | object | 0.000000 | --- | --- | --- | --- | --- | --- | --- |
| price | 0 | 295 | 22552 | int64 | 0.000000 | 9000/0 | 67.1437 | 45 | 48517.2 | 791.146 | 220.266 | 26.7332 |
| minimum_nights | 0 | 102 | 22552 | int64 | 0.000000 | 5000/1 | 7.15706 | 2 | 1653.65 | 10154.6 | 40.6651 | 85.888 |
| number_of_reviews | 0 | 306 | 22552 | int64 | 0.000000 | 498/0 | 17.8407 | 5 | 1352.01 | 26.6259 | 36.7696 | 4.38296 |
| last_review | 3908 | 1313 | 18644 | object | 20.961167 | --- | --- | --- | --- | --- | --- | --- |
| reviews_per_month | 3914 | 769 | 18638 | float64 | 21.000107 | 36.67/0.01 | 1.13553 | 0.54 | 2.2713 | 34.3418 | 1.50708 | 3.39166 |
| calculated_host_listings_count | 0 | 23 | 22552 | int64 | 0.000000 | 45/1 | 1.91823 | 1 | 13.4488 | 75.7204 | 3.66726 | 7.86513 |
| availability_365 | 0 | 366 | 22552 | int64 | 0.000000 | 365/0 | 79.8528 | 4 | 14248.8 | 0.109892 | 119.368 | 1.29135 |
Numeric optimization
for dtype in ['float','int','object']:
selected_dtype = listings_df.select_dtypes(include=[dtype])
mean_usage_b = selected_dtype.memory_usage(deep=True).mean()
mean_usage_mb = mean_usage_b / 1024 ** 2
print("Average memory usage for {} columns: {:03.2f} MB".format(dtype, mean_usage_mb))
Average memory usage for float columns: 0.13 MB Average memory usage for int columns: 0.15 MB Average memory usage for object columns: 1.44 MB
int_types = ["uint8", "int8", "int16"]
for it in int_types:
print(np.iinfo(it))
Machine parameters for uint8 --------------------------------------------------------------- min = 0 max = 255 --------------------------------------------------------------- Machine parameters for int8 --------------------------------------------------------------- min = -128 max = 127 --------------------------------------------------------------- Machine parameters for int16 --------------------------------------------------------------- min = -32768 max = 32767 ---------------------------------------------------------------
Uint 8 best choice if we do not have negative values
list_int = listings_df.select_dtypes(include=['int'])
converted_int = list_int.apply(pd.to_numeric, downcast='unsigned')
print(mem_usage(list_int))
print(mem_usage(converted_int))
compare_ints = pd.concat([list_int.dtypes,converted_int.dtypes],axis=1)
compare_ints.columns = ['before','after']
compare_ints.apply(pd.Series.value_counts)
1.20 MB 0.37 MB
| before | after | |
|---|---|---|
| uint8 | NaN | 1.0 |
| uint16 | NaN | 4.0 |
| uint32 | NaN | 2.0 |
| int64 | 7.0 | NaN |
list_float = listings_df.select_dtypes(include=['float'])
converted_float = list_float.apply(pd.to_numeric,downcast='float')
print(mem_usage(list_float))
print(mem_usage(converted_float))
compare_floats = pd.concat([list_float.dtypes, converted_float.dtypes], axis=1)
compare_floats.columns = ['before','after']
compare_floats.apply(pd.Series.value_counts)
0.52 MB 0.26 MB
| before | after | |
|---|---|---|
| float32 | NaN | 3.0 |
| float64 | 3.0 | NaN |
optimized_listings = listings_df.copy()
optimized_listings[converted_int.columns] = converted_int
optimized_listings[converted_float.columns] = converted_float
print(mem_usage(listings_df))
print(mem_usage(optimized_listings))
11.79 MB 10.69 MB
Objects optimizations
listings_obj = listings_df.select_dtypes(include=['object']).copy()
listings_obj.describe()
| name | host_name | neighbourhood_group | neighbourhood | room_type | last_review | |
|---|---|---|---|---|---|---|
| count | 22493 | 22526 | 22552 | 22552 | 22552 | 18644 |
| unique | 21873 | 5997 | 12 | 136 | 3 | 1312 |
| top | Berlin Wohnung | Anna | Friedrichshain-Kreuzberg | Tempelhofer Vorstadt | Private room | 2018-11-04 |
| freq | 14 | 216 | 5497 | 1325 | 11534 | 618 |
converted_obj = pd.DataFrame()
for col in listings_obj.columns:
num_unique_values = len(listings_obj[col].unique())
num_total_values = len(listings_obj[col])
if num_unique_values / num_total_values < 0.5:
converted_obj.loc[:,col] = listings_obj[col].astype('category')
else:
converted_obj.loc[:,col] = listings_obj[col]
print(mem_usage(listings_obj))
print(mem_usage(converted_obj))
compare_obj = pd.concat([listings_obj.dtypes, converted_obj.dtypes], axis=1)
compare_obj.columns = ['before', 'after']
compare_obj.apply(pd.Series.value_counts)
10.06 MB 2.96 MB
| before | after | |
|---|---|---|
| object | 6.0 | 1 |
| category | NaN | 1 |
| category | NaN | 1 |
| category | NaN | 1 |
| category | NaN | 1 |
| category | NaN | 1 |
optimized_listings[converted_obj.columns] = converted_obj
print(mem_usage(listings_df))
print(mem_usage(optimized_listings))
11.79 MB 3.59 MB
dtypes = optimized_listings.dtypes
dtypes_col = dtypes.index
dtypes_type = [i.name for i in dtypes.values]
column_types = dict(zip(dtypes_col, dtypes_type))
preview = first2pairs = {key: value for key, value in list(column_types.items())[:10]}
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(preview)
{ 'host_id': 'uint32',
'host_name': 'category',
'id': 'uint32',
'latitude': 'float32',
'longitude': 'float32',
'name': 'object',
'neighbourhood': 'category',
'neighbourhood_group': 'category',
'price': 'uint16',
'room_type': 'category'}
read_and_optimized = pd.read_csv(listings, dtype=column_types)
not_optimizes = pd.read_csv(listings)
print(mem_usage(read_and_optimized))
print(mem_usage(not_optimizes))
3.59 MB 11.50 MB
Datetime optimization
reviews_df = pd.read_csv(reviews, verbose=True).rename(columns={"listing_id": "id"})
Tokenization took: 63.19 ms Type conversion took: 36.44 ms Parser memory cleanup took: 0.51 ms Tokenization took: 25.43 ms Type conversion took: 16.40 ms Parser memory cleanup took: 0.01 ms
@timeit(repeat=3, number=10)
def convert(df, column_name):
return pd.to_datetime(df[column_name])
reviews_df['date'] = convert(reviews_df, 'date')
Function `convert` ran in average of 0.108751 seconds.
@timeit(repeat=3, number=100)
def convert_with_format(df, column_name):
return pd.to_datetime(df[column_name], format='%Y-%m-%d %H:%M:%S.%f')
reviews_df['date'] = convert_with_format(reviews_df, 'date')
Function `convert_with_format` ran in average of 0.027986 seconds.
Index optimization
@timeit(repeat=3, number=100)
def simple_merge():
listings_df.merge(reviews_df, on='id')
simple_merge()
Function `simple_merge` ran in average of 0.617740 seconds.
reviews_df1 = reviews_df.set_index('id')
listings_df1 = listings_df.set_index('id')
#@profile_dec
@timeit(repeat=3, number=100)
def index_merge():
listings_df1.merge(reviews_df1, left_index=True, right_index=True)
index_merge()
Function `index_merge` ran in average of 0.190223 seconds.
@profile_dec
def index_merge():
listings_df1.merge(reviews_df1, left_index=True, right_index=True)
index_merge()
Tue Dec 8 13:34:39 2020 index_merge.prof
5553 function calls (5538 primitive calls) in 0.180 seconds
Ordered by: cumulative time
ncalls tottime percall cumtime percall filename:lineno(function)
1 0.008 0.008 0.180 0.180 <ipython-input-55-98f9b8d1e284>:1(index_merge)
1 0.000 0.000 0.172 0.172 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/frame.py:7931(merge)
1 0.001 0.001 0.172 0.172 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/reshape/merge.py:57(merge)
1 0.000 0.000 0.171 0.171 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/reshape/merge.py:664(get_result)
1 0.000 0.000 0.165 0.165 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:31(concatenate_block_managers)
10 0.000 0.000 0.161 0.016 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:306(_concatenate_join_units)
10 0.000 0.000 0.156 0.016 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:317(<listcomp>)
10 0.026 0.003 0.156 0.016 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:233(get_reindexed_values)
19 0.001 0.000 0.130 0.007 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/algorithms.py:1616(take_nd)
23 0.046 0.002 0.046 0.002 {built-in method numpy.empty}
4 0.040 0.010 0.040 0.010 {pandas._libs.algos.take_2d_axis1_object_object}
4 0.013 0.003 0.013 0.003 {pandas._libs.algos.take_2d_axis0_int64_int64}
3 0.010 0.003 0.010 0.003 {pandas._libs.algos.take_2d_axis1_int64_int64}
4 0.009 0.002 0.009 0.002 {pandas._libs.algos.take_2d_axis0_object_object}
2 0.005 0.003 0.005 0.003 {pandas._libs.algos.take_2d_axis0_float64_float64}
1 0.000 0.000 0.005 0.005 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/reshape/merge.py:866(_get_join_info)
1 0.000 0.000 0.005 0.005 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:3415(join)
1 0.000 0.000 0.005 0.005 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:3783(_join_monotonic)
1 0.005 0.005 0.005 0.005 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:259(_inner_indexer)
10 0.000 0.000 0.004 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:351(_get_empty_dtype_and_na)
2 0.004 0.002 0.004 0.002 {pandas._libs.algos.take_2d_axis1_float64_float64}
1 0.000 0.000 0.003 0.003 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/algorithms.py:1327(wrapper)
10 0.000 0.000 0.003 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:194(dtype)
10 0.003 0.000 0.003 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:185(needs_filling)
10 0.000 0.000 0.002 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:465(_is_uniform_join_units)
40 0.000 0.000 0.002 0.000 {built-in method builtins.all}
20 0.000 0.000 0.002 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:480(<genexpr>)
10 0.000 0.000 0.002 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:204(is_na)
1 0.002 0.002 0.002 0.002 {method 'copy' of 'numpy.ndarray' objects}
18 0.000 0.000 0.001 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/missing.py:47(isna)
18 0.000 0.000 0.001 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/missing.py:130(_isna)
7 0.000 0.000 0.001 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/missing.py:193(_isna_ndarraylike)
1071 0.001 0.000 0.001 0.000 {built-in method builtins.isinstance}
1 0.000 0.000 0.001 0.001 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/reshape/merge.py:2066(_items_overlap_with_suffix)
1 0.000 0.000 0.001 0.001 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:2619(intersection)
6/3 0.000 0.000 0.001 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:293(__new__)
4 0.000 0.000 0.001 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/missing.py:235(_isna_string_dtype)
10 0.000 0.000 0.001 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:2701(make_block)
123 0.000 0.000 0.001 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/base.py:256(is_dtype)
4 0.001 0.000 0.001 0.000 {built-in method pandas._libs.missing.isnaobj}
19 0.000 0.000 0.001 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/cast.py:442(maybe_promote)
10 0.000 0.000 0.001 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:2655(get_block_type)
19 0.000 0.000 0.001 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/algorithms.py:1487(_get_take_nd_function)
475 0.000 0.000 0.001 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/generic.py:10(_check)
762 0.001 0.000 0.001 0.000 {built-in method builtins.getattr}
32 0.001 0.000 0.001 0.000 {method 'reduce' of 'numpy.ufunc' objects}
1 0.000 0.000 0.001 0.001 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:2957(get_indexer)
25 0.000 0.000 0.001 0.000 {method 'any' of 'numpy.ndarray' objects}
123 0.000 0.000 0.001 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:1460(is_extension_array_dtype)
25 0.000 0.000 0.001 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/numpy/core/_methods.py:53(_any)
48 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/numpy/core/_dtype.py:321(_name_get)
5 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:5559(ensure_index)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:4133(append)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:4161(_concat)
38 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:530(is_categorical_dtype)
42 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:381(is_datetime64tz_dtype)
27 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:456(is_period_dtype)
24 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/dtypes.py:906(is_dtype)
123 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/base.py:413(find)
25 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:492(is_interval_dtype)
48 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/numpy/core/_dtype.py:307(_name_includes_bit_suffix)
24 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:224(is_sparse)
22 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/dtypes.py:1119(is_dtype)
7 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:566(is_string_dtype)
11 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:1541(_is_dtype)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/concat.py:110(concat_compat)
40 0.000 0.000 0.000 0.000 <frozen importlib._bootstrap>:997(_handle_fromlist)
27 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/numpy/core/numerictypes.py:360(issubdtype)
9 0.000 0.000 0.000 0.000 {built-in method builtins.any}
7 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:595(condition)
3 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:5726(_maybe_cast_data_without_dtype)
19 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/construction.py:339(extract_array)
4 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:598(is_excluded_dtype)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:49(<listcomp>)
19 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/cast.py:598(_ensure_dtype_type)
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:87(_get_mgr_concatenation_plan)
55 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:1600(_is_dtype_type)
16 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:603(<genexpr>)
488 0.000 0.000 0.000 0.000 {built-in method builtins.issubclass}
5 0.000 0.000 0.000 0.000 {pandas._libs.lib.infer_dtype}
7 0.000 0.000 0.000 0.000 {method 'all' of 'numpy.ndarray' objects}
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/concat.py:29(get_dtype_kinds)
10 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:124(__init__)
7 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/numpy/core/_methods.py:56(_all)
19 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:1296(is_float_dtype)
54 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/numpy/core/numerictypes.py:286(issubclass_)
93 0.000 0.000 0.000 0.000 {built-in method builtins.hasattr}
191/179 0.000 0.000 0.000 0.000 {built-in method builtins.len}
6 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:1330(is_bool_dtype)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/numeric.py:258(_wrap_joined_index)
31 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/numpy/core/_asarray.py:14(asarray)
34 0.000 0.000 0.000 0.000 {built-in method numpy.array}
6 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/common.py:218(asarray_tuplesafe)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/reshape/merge.py:576(__init__)
8 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:5656(maybe_extract_name)
15 0.000 0.000 0.000 0.000 {method 'ravel' of 'numpy.ndarray' objects}
5 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/numpy/core/numerictypes.py:569(find_common_type)
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:4196(equals)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/managers.py:132(__init__)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/numeric.py:50(__new__)
4 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:2374(__init__)
3 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:1180(needs_i8_conversion)
10 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:490(_is_uniform_reindex)
11 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:525(_combine_concat_plans)
3 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:5672(_maybe_cast_with_dtype)
4 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(min_scalar_type)
5 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:463(_simple_new)
5 0.000 0.000 0.000 0.000 {built-in method numpy.core._multiarray_umath.implement_array_function}
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/managers.py:321(_verify_integrity)
25 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:1565(_get_dtype)
1 0.000 0.000 0.000 0.000 {method 'get_indexer' of 'pandas._libs.index.IndexEngine' objects}
29 0.000 0.000 0.000 0.000 {method 'format' of 'str' objects}
9 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:194(is_object_dtype)
6 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:806(is_unsigned_integer_dtype)
20 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:493(<genexpr>)
48 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/numpy/core/_dtype.py:24(_kind_name)
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:2000(inferred_type)
4 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:2380(is_bool)
6 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:1223(is_numeric_dtype)
34 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:180(<lambda>)
19 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:422(is_timedelta64_dtype)
5 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/numpy/core/numerictypes.py:621(<listcomp>)
3 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/managers.py:212(shape)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:1646(is_unique)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:1685(is_boolean)
2 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_list_like}
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/ops/__init__.py:88(get_op_result_name)
19 0.000 0.000 0.000 0.000 {method 'get' of 'dict' objects}
2 0.000 0.000 0.000 0.000 {method 'take' of 'numpy.ndarray' objects}
6 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:750(is_signed_integer_dtype)
37 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_scalar}
8 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/inference.py:322(is_hashable)
34 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:178(classes)
21 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:188(<lambda>)
10 0.000 0.000 0.000 0.000 {method 'add' of 'pandas._libs.internals.BlockPlacement' objects}
19 0.000 0.000 0.000 0.000 {pandas._libs.algos.ensure_int64}
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/generic.py:447(_info_axis)
9 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:1733(pandas_dtype)
10 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:237(mgr_locs)
3 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:696(is_integer_dtype)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:701(take)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:4717(_maybe_promote)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:2048(__init__)
9 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/managers.py:214(<genexpr>)
3 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:1025(is_datetime_or_timedelta_dtype)
3 0.000 0.000 0.000 0.000 /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/abc.py:180(__instancecheck__)
11 0.000 0.000 0.000 0.000 {built-in method pandas._libs.missing.checknull}
4 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:1265(is_string_like_dtype)
4 0.000 0.000 0.000 0.000 {built-in method pandas._libs.lib.is_bool_array}
20 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:476(<genexpr>)
20 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:483(<genexpr>)
1 0.000 0.000 0.000 0.000 <__array_function__ internals>:2(concatenate)
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/missing.py:358(array_equivalent)
8 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:348(is_datetime64_dtype)
4 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:608(is_dtype_equal)
10 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:135(_check_ndim)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/reshape/merge.py:951(_get_merge_keys)
1 0.000 0.000 0.000 0.000 {built-in method builtins.sum}
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/frame.py:441(__init__)
12 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:567(__len__)
3 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:213(get_values)
12 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:905(is_datetime64_any_dtype)
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/ops/__init__.py:111(_maybe_match_name)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:554(_engine)
10 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:276(__len__)
11 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/managers.py:323(<genexpr>)
3 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/reshape/merge.py:2048(_any)
10 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/numpy/core/numerictypes.py:545(_can_coerce_all)
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/numpy/core/_dtype.py:178(_datetime_metadata_str)
3 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:1571(is_monotonic)
3 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/concat.py:148(<genexpr>)
10 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/concat.py:174(__init__)
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/reshape/merge.py:2052(_validate_operand)
7 0.000 0.000 0.000 0.000 {built-in method builtins.max}
4 0.000 0.000 0.000 0.000 /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/_weakrefset.py:70(__contains__)
4 0.000 0.000 0.000 0.000 {method 'reshape' of 'numpy.ndarray' objects}
19 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_float}
3 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/common.py:197(any_not_none)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:498(_shallow_copy)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:1182(name)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:2835(_assert_can_do_setop)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/generic.py:471(ndim)
10 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:315(dtype)
12 0.000 0.000 0.000 0.000 {method 'items' of 'dict' objects}
21 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:183(classes_and_not_datetimelike)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/generic.py:195(__init__)
11 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/managers.py:216(ndim)
30 0.000 0.000 0.000 0.000 {method 'append' of 'list' objects}
10 0.000 0.000 0.000 0.000 {method 'values' of 'dict' objects}
5 0.000 0.000 0.000 0.000 {built-in method __new__ of type object at 0x1014c36b0}
3 0.000 0.000 0.000 0.000 {method 'view' of 'numpy.ndarray' objects}
3 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:1578(is_monotonic_increasing)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/managers.py:138(<listcomp>)
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:520(is_)
5 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:544(_reset_identity)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:723(_assert_take_fillable)
15 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:3870(_values)
4 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:3896(_get_engine_target)
30 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:233(mgr_locs)
10 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:311(shape)
8 0.000 0.000 0.000 0.000 {built-in method builtins.hash}
5 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/numpy/core/numerictypes.py:622(<listcomp>)
5 0.000 0.000 0.000 0.000 {pandas._libs.algos.ensure_platform_int}
4 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/common.py:1293(<lambda>)
14 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:1175(name)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:4156(<setcomp>)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:4165(<listcomp>)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/concat.py:139(<listcomp>)
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/concat.py:144(<genexpr>)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/reshape/merge.py:775(_maybe_add_join_keys)
9 0.000 0.000 0.000 0.000 {method 'copy' of 'dict' objects}
1 0.000 0.000 0.000 0.000 {method 'nonzero' of 'numpy.ndarray' objects}
1 0.000 0.000 0.000 0.000 {method 'to_datetime64' of 'pandas._libs.tslibs.nattype._NaT' objects}
3 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/common.py:268(maybe_make_list)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/missing.py:665(clean_reindex_fill_method)
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/concat.py:128(is_nonempty)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:2056(_maybe_coerce_values)
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/managers.py:233(_is_single_block)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/reshape/merge.py:1067(_maybe_coerce_merge_keys)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/reshape/merge.py:1198(_validate_specification)
2 0.000 0.000 0.000 0.000 {built-in method numpy.datetime_data}
1 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_integer}
2 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_bool}
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:590(dtype)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:2480(_validate_sort_keyword)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/dtypes/concat.py:147(<setcomp>)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/frame.py:421(_constructor)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/numeric.py:81(_validate_dtype)
3 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:229(fill_value)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/blocks.py:2052(_can_hold_na)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/managers.py:163(blknos)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/managers.py:259(items)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/generic.py:5095(__finalize__)
1 0.000 0.000 0.000 0.000 {pandas._libs.internals.get_blkno_placements}
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/frame.py:568(axes)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/reshape/merge.py:740(_maybe_restore_index_levels)
2 0.000 0.000 0.000 0.000 {method 'add' of 'set' objects}
1 0.000 0.000 0.000 0.000 {method 'startswith' of 'str' objects}
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/numpy/core/multiarray.py:143(concatenate)
4 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/numpy/core/multiarray.py:583(min_scalar_type)
1 0.000 0.000 0.000 0.000 {method 'disable' of '_lsprof.Profiler' objects}
1 0.000 0.000 0.000 0.000 {pandas._libs.lib.is_iterator}
3 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/common.py:201(<genexpr>)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/missing.py:75(clean_fill_method)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:561(<lambda>)
2 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:1378(nlevels)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/indexes/base.py:5645(_validate_join_method)
1 0.000 0.000 0.000 0.000 /Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/core/internals/managers.py:179(blklocs)
listings_2 = listings_df.set_index('id', drop=False)
%timeit listings_2.loc[29844866, 'name']
12.6 µs ± 424 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%timeit listings_2.at[29844866, 'name']
6.73 µs ± 121 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
listings_reset = listings_df.reset_index(drop=True)
%timeit listings_reset.loc[listings_reset['id'] == 29844866, 'name']
452 µs ± 36.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit listings_reset.iloc[22529]['name']
225 µs ± 5.52 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Filter_optimization
#listings_s = listings_df.sample(frac=0.2, random_state=1337)
%timeit listings_df.merge(reviews_df, on='id')
377 ms ± 139 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
def optimized_merge(df1, df2, merge_column):
df2 = df2[df2[merge_column].isin(df1[merge_column])]
return df1.merge(df2, on=merge_column)
%timeit optimized_merge(listings_df, reviews_df, 'id')
212 ms ± 6.87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
When chaining multiple operations it is worthwhile to think about which operations to execute first. Filter steps should be executed as early as possible. Even when making inner joins between dataframes, it is worthwhile to filter before merging. If we sample our listings with .sample(frac=0.2) and then merge the reviews to it, we see that its more efficient to filter the reviews first.
Vectorization
Vectorization is the process of executing operations on entire arrays. Similarly to numpy, Pandas has built in optimizations for vectorized operations. It is advised to avoid for loops when working with dataframes, since read and write operations are costly. It is not always possible to vectorize, so i will also show what the best iterative options for pandas are by comparing .iloc[], .iterrows(), .loc[]and.map()/.apply().
What Pandas developers said?
his is the general order of precedence for performance of various operations:
1) vectorization 2) using a custom cython routine 3) apply a) reductions that can be performed in cython b) iteration in python space 4) itertuples 5) iterrows 6) updating an empty frame (e.g. using loc one-row-at-a-time)
Using a custom Cython routine is usually too complicated, so let's skip that for now.
1) Vectorization is ALWAYS, ALWAYS the first and best choice. However, there is a small set of cases (usually involving a recurrence) which cannot be vectorized in obvious ways. Furthermore, on a smallish DataFrame, it may be faster to use other methods.
3) apply usually can be handled by an iterator in Cython space. This is handled internally by pandas, though it depends on what is going on inside the apply expression. For example, df.apply(lambda x: np.sum(x)) will be executed pretty swiftly, though of course, df.sum(1) is even better. However something like df.apply(lambda x: x['b'] + 1) will be executed in Python space, and consequently is much slower.
4) itertuples does not box the data into a Series. It just returns the data in the form of tuples.
5) iterrows DOES box the data into a Series. Unless you really need this, use another method.
6) Updating an empty frame a-single-row-at-a-time. I have seen this method used WAY too much. It is by far the slowest. It is probably common place (and reasonably fast for some python structures), but a DataFrame does a fair number of checks on indexing, so this will always be very slow to update a row at a time. Much better to create new structures and concat.
demand_df = d_read(demand)
demand_df['date_time'] = pd.to_datetime(demand_df['date_time'])
dd = demand_df.sample(400)
Function `d_read` ran in average of 0.006361 seconds.
def apply_tariff(kwh, hour):
"""Calculates cost of electricity for given hour."""
if 0 <= hour < 7:
rate = 12
elif 7 <= hour < 17:
rate = 20
elif 17 <= hour < 24:
rate = 28
else:
raise ValueError(f'Invalid hour: {hour}')
return rate * kwh
#IT IS VERY BAD SOLUTION
@timeit(repeat=3, number=100)
def apply_tariff_loop(df):
energy_cost_list = []
for i in range(len(df)):
# Get electricity used and hour of day
energy_used = df.iloc[i]['energy_kwh']
hour = df.iloc[i]['date_time'].hour
energy_cost = apply_tariff(energy_used, hour)
energy_cost_list.append(energy_cost)
df['cost_cents'] = energy_cost_list
apply_tariff_loop(dd)
Function `apply_tariff_loop` ran in average of 0.227377 seconds.
You can consider the above to be an “antipattern” in Pandas for several reasons. Firstly, it needs to initialize a list in which the outputs will be recorded.
Secondly, it uses the opaque object range(0, len(df)) to loop over, and then after applying apply_tariff(), it has to append the result to a list that is used to make the new DataFrame column. It also does what is called chained indexing with df.iloc[i]['date_time'], which often leads to unintended results.
@timeit(repeat=3, number=100)
def apply_tariff_iterrows(df):
energy_cost_list = []
for index, row in df.iterrows():
# Get electricity used and hour of day
energy_used = row['energy_kwh']
hour = row['date_time'].hour
# Append cost list
energy_cost = apply_tariff(energy_used, hour)
energy_cost_list.append(energy_cost)
df['cost_cents'] = energy_cost_list
apply_tariff_iterrows(dd)
Function `apply_tariff_iterrows` ran in average of 0.059319 seconds.
.itertuples() yields a namedtuple for each row, with the row’s index value as the first element of the tuple. A nametuple is a data structure from Python’s collections module that behaves like a Python tuple but has fields accessible by attribute lookup.
.iterrows() yields pairs (tuples) of (index, Series) for each row in the DataFrame.
@timeit(repeat=3, number=100)
def apply_tariff_withapply(df):
df['cost_cents'] = df.apply(
lambda row: apply_tariff(
kwh=row['energy_kwh'],
hour=row['date_time'].hour),
axis=1)
apply_tariff_withapply(dd)
Function `apply_tariff_withapply` ran in average of 0.010794 seconds.
Pandas’ .apply() method takes functions (callables) and applies them along an axis of a DataFrame (all rows, or all columns).
dd.set_index('date_time', inplace=True)
@timeit(repeat=3, number=100)
def apply_tariff_isin(df):
# Define hour range Boolean arrays
peak_hours = df.index.hour.isin(range(17, 24))
shoulder_hours = df.index.hour.isin(range(7, 17))
off_peak_hours = df.index.hour.isin(range(0, 7))
# Apply tariffs to hour ranges
df.loc[peak_hours, 'cost_cents'] = df.loc[peak_hours, 'energy_kwh'] * 28
df.loc[shoulder_hours,'cost_cents'] = df.loc[shoulder_hours, 'energy_kwh'] * 20
df.loc[off_peak_hours,'cost_cents'] = df.loc[off_peak_hours, 'energy_kwh'] * 12
apply_tariff_isin(dd)
Function `apply_tariff_isin` ran in average of 0.003570 seconds.
But how can you apply condition calculations as vectorized operations in Pandas? One trick is to select and group parts the DataFrame based on your conditions and then apply a vectorized operation to each selected group.
In this next example, you will see how to select rows with Pandas’ .isin() method and then apply the appropriate tariff in a vectorized operation
@timeit(repeat=3, number=100)
def apply_tariff_cut(df):
cents_per_kwh = pd.cut(x=df.index.hour,
bins=[0, 7, 17, 24],
include_lowest=True,
labels=[12, 20, 28]).astype(int)
df['cost_cents'] = cents_per_kwh * df['energy_kwh']
apply_tariff_cut(dd)
Function `apply_tariff_cut` ran in average of 0.001685 seconds.
In apply_tariff_isin(), we are still admittedly doing some “manual work” by calling df.loc and df.index.hour.isin() three times each.
This is a fully vectorized way to get to your intended result, and it comes out on top in terms of timing
@timeit(repeat=3, number=100)
def apply_tariff_digitize(df):
prices = np.array([12, 20, 28])
bins = np.digitize(df.index.hour.values, bins=[7, 17, 24])
df['cost_cents'] = prices[bins] * df['energy_kwh'].values
apply_tariff_digitize(dd)
Function `apply_tariff_digitize` ran in average of 0.000304 seconds.
NumPy’s digitize() function. It is similar to Pandas’ cut() in that the data will be binned, but this time it will be represented by an array of indexes representing which bin each hour belongs to.
.eval() .query()
nrows, ncols = 100000, 100
rng = np.random.RandomState(42)
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols))
for i in range(4))
%timeit pd.eval('df1 + df2 + df3 + df4')
97.6 ms ± 905 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit df1 + df2 + df3 + df4
92.4 ms ± 1.94 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
df_query = pd.DataFrame(rng.rand(1000, 3), columns=['A', 'B', 'C'])
df_query
| A | B | C | |
|---|---|---|---|
| 0 | 0.615875 | 0.525167 | 0.047354 |
| 1 | 0.330858 | 0.412879 | 0.441564 |
| 2 | 0.689047 | 0.559068 | 0.230350 |
| 3 | 0.290486 | 0.695479 | 0.852587 |
| 4 | 0.424280 | 0.534344 | 0.245216 |
| ... | ... | ... | ... |
| 995 | 0.746470 | 0.545752 | 0.614236 |
| 996 | 0.223362 | 0.748549 | 0.550715 |
| 997 | 0.047455 | 0.450104 | 0.468734 |
| 998 | 0.895440 | 0.894235 | 0.209260 |
| 999 | 0.254963 | 0.861045 | 0.704858 |
1000 rows × 3 columns
%timeit df_query[(df_query.A < 0.5) & (df_query.B < 0.5)]
902 µs ± 49.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit df_query.query('A < 0.5 and B < 0.5')
2.28 ms ± 183 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Cmean = df_query['C'].mean()
%timeit result1 = df_query[(df_query.A < Cmean) & (df_query.B < Cmean)]
%timeit result2 = df_query.query('A < @Cmean and B < @Cmean')
798 µs ± 44.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) 2.13 ms ± 60.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Tricks (15 - 20 min)
# Trick 1
profile = listings_df.profile_report()
#profile.to_file(output_file=output)
profile
# Trick 2
df = pd.read_clipboard()
df
| # | Trick | 15 |
|---|
# Trick 3
# You can use numpy without importing it
pd.np.nan
/Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/ipykernel_launcher.py:3: FutureWarning: The pandas.np module is deprecated and will be removed from pandas in a future version. Import numpy directly instead This is separate from the ipykernel package so we can avoid doing imports until
nan
# Trick 4
test_frame = pd.util.testing.makeDataFrame()
test_frame
/Users/okravchenko/PycharmProjects/pandas_tricks/myvenv/lib/python3.6/site-packages/pandas/util/__init__.py:23: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead. import pandas.util.testing
| A | B | C | D | |
|---|---|---|---|---|
| pSJjyZoBpB | 0.714897 | 0.988601 | -0.202326 | 0.881356 |
| 4D8J6qvpJE | -1.854868 | -0.112148 | 0.658709 | 1.369165 |
| CkmZNeMYKM | -0.076568 | -1.382029 | -2.019700 | 0.904368 |
| 0o7JMHcMdq | 0.306416 | 1.143982 | -0.432829 | -0.041392 |
| vmKEUI4ykK | -0.410971 | 0.972225 | 1.812504 | 1.417916 |
| dSk1NoGvvf | -0.191573 | -0.050232 | -0.820390 | -2.027947 |
| YIEVw6z3a7 | 1.171626 | 0.666362 | 0.800444 | -0.007618 |
| QjOwfYzZGZ | -1.714355 | 1.189557 | 1.264362 | -0.422383 |
| vt2Ce822Fx | -0.838913 | 2.069088 | -1.223751 | 1.356260 |
| tlQeu7k9XU | 0.579015 | 0.141305 | 0.576735 | 1.494810 |
| zb4fEFx21h | -2.249923 | 0.877460 | 0.988045 | 0.406413 |
| HOnHEDDn1y | 0.278705 | 0.373367 | -0.199062 | 0.293664 |
| q3fNni6s93 | 0.943045 | 0.892671 | 0.967201 | 0.299227 |
| l5ltQQhDSJ | 0.310991 | 0.055744 | -0.040473 | 0.463064 |
| lq9R48BNLA | 1.679156 | -2.808187 | -0.840771 | 0.823816 |
| zPNL01yCDU | -2.613930 | -1.007763 | -0.963295 | -0.538252 |
| JsXyi9OXcP | 0.658004 | -0.129366 | -0.495296 | -0.367905 |
| cWcUufpQb9 | -1.066387 | -0.492941 | -1.823843 | 0.695233 |
| uWcqd3Z4zR | 0.698810 | 1.415073 | 0.305269 | 0.039034 |
| XBM4Y6Ib11 | -1.985595 | 0.985790 | -0.128100 | 0.630387 |
| xay4vpIkby | -0.595765 | 0.221349 | -0.309195 | -0.011029 |
| eFmnYBTs34 | 1.001177 | 0.441530 | -0.392937 | 0.270298 |
| uj6nnHqOST | -1.718152 | 0.385562 | 0.255971 | 1.439179 |
| hkfGhPKdAw | 0.848042 | -0.296234 | 0.162036 | -0.505019 |
| rkuqaTtNOW | 0.468582 | 2.591088 | 0.706702 | -0.829425 |
| RbBa6f9hPv | 0.754902 | -1.626977 | 0.935290 | -1.196510 |
| 4BrFH6DqTO | -0.768832 | 0.134553 | -0.443890 | 0.993485 |
| CgPHqORBrI | 1.284968 | 0.105240 | -0.568276 | -0.509625 |
| L1xX4nXrfm | -1.410510 | 1.226987 | -0.119667 | -1.536850 |
| 01b6CKjopS | 1.868615 | 1.294601 | 0.065204 | 0.052789 |
# Trick 5
# working with big dataset/skiprows
df_normal = pd.read_csv(reviews)
print(df_normal.shape)
df_big = pd.read_csv(reviews, skiprows = lambda x: x > 0 and np.random.rand() > 0.01)
print(df_big.shape)
(401963, 2) (4049, 2)
# Trick 6
listings_df.head(3).T
| 0 | 1 | 2 | |
|---|---|---|---|
| id | 2015 | 2695 | 3176 |
| name | Berlin-Mitte Value! Quiet courtyard/very central | Prenzlauer Berg close to Mauerpark | Fabulous Flat in great Location |
| host_id | 2217 | 2986 | 3718 |
| host_name | Ian | Michael | Britta |
| neighbourhood_group | Mitte | Pankow | Pankow |
| neighbourhood | Brunnenstr. Süd | Prenzlauer Berg Nordwest | Prenzlauer Berg Südwest |
| latitude | 52.5345 | 52.5485 | 52.535 |
| longitude | 13.4026 | 13.4046 | 13.4176 |
| room_type | Entire home/apt | Private room | Entire home/apt |
| price | 60 | 17 | 90 |
| minimum_nights | 4 | 2 | 62 |
| number_of_reviews | 118 | 6 | 143 |
| last_review | 2018-10-28 | 2018-10-01 | 2017-03-20 |
| reviews_per_month | 3.76 | 1.42 | 1.25 |
| calculated_host_listings_count | 4 | 1 | 1 |
| availability_365 | 141 | 0 | 220 |
host_series = listings_df.pop('host_name')
host_series
0 Ian
1 Michael
2 Britta
3 Jana
4 Bright
...
22547 Ulisses
22548 Jörg
22549 Martin
22550 Arte Luise
22551 Sebastian
Name: host_name, Length: 22552, dtype: object
listings_df.head(3).T
| 0 | 1 | 2 | |
|---|---|---|---|
| id | 2015 | 2695 | 3176 |
| name | Berlin-Mitte Value! Quiet courtyard/very central | Prenzlauer Berg close to Mauerpark | Fabulous Flat in great Location |
| host_id | 2217 | 2986 | 3718 |
| neighbourhood_group | Mitte | Pankow | Pankow |
| neighbourhood | Brunnenstr. Süd | Prenzlauer Berg Nordwest | Prenzlauer Berg Südwest |
| latitude | 52.5345 | 52.5485 | 52.535 |
| longitude | 13.4026 | 13.4046 | 13.4176 |
| room_type | Entire home/apt | Private room | Entire home/apt |
| price | 60 | 17 | 90 |
| minimum_nights | 4 | 2 | 62 |
| number_of_reviews | 118 | 6 | 143 |
| last_review | 2018-10-28 | 2018-10-01 | 2017-03-20 |
| reviews_per_month | 3.76 | 1.42 | 1.25 |
| calculated_host_listings_count | 4 | 1 | 1 |
| availability_365 | 141 | 0 | 220 |
# Trick 7
listings_reverse = listings_df.loc[:, ::-1]
listings_reverse.head(2).T
| 0 | 1 | |
|---|---|---|
| availability_365 | 141 | 0 |
| calculated_host_listings_count | 4 | 1 |
| reviews_per_month | 3.76 | 1.42 |
| last_review | 2018-10-28 | 2018-10-01 |
| number_of_reviews | 118 | 6 |
| minimum_nights | 4 | 2 |
| price | 60 | 17 |
| room_type | Entire home/apt | Private room |
| longitude | 13.4026 | 13.4046 |
| latitude | 52.5345 | 52.5485 |
| neighbourhood | Brunnenstr. Süd | Prenzlauer Berg Nordwest |
| neighbourhood_group | Mitte | Pankow |
| host_id | 2217 | 2986 |
| name | Berlin-Mitte Value! Quiet courtyard/very central | Prenzlauer Berg close to Mauerpark |
| id | 2015 | 2695 |
# Trick 8
n_count = listings_reverse.where(listings_reverse.neighbourhood_group
.isin(listings_reverse.neighbourhood_group.value_counts()
.nlargest(2).index), other='Other')
n_count.neighbourhood_group.value_counts()
Other 12424 Friedrichshain-Kreuzberg 5497 Mitte 4631 Name: neighbourhood_group, dtype: int64
# Trick 9
crit1 = listings_reverse.price < 60
crit2 = listings_reverse.number_of_reviews > 3
critera = reduce(lambda x, y: x & y, [crit1, crit2])
listings_reverse[critera].head(3).T
| 1 | 3 | 4 | |
|---|---|---|---|
| availability_365 | 0 | 297 | 26 |
| calculated_host_listings_count | 1 | 1 | 1 |
| reviews_per_month | 1.42 | 0.39 | 1.75 |
| last_review | 2018-10-01 | 2018-08-16 | 2018-11-04 |
| number_of_reviews | 6 | 25 | 197 |
| minimum_nights | 2 | 5 | 2 |
| price | 17 | 26 | 42 |
| room_type | Private room | Private room | Private room |
| longitude | 13.4046 | 13.3491 | 13.4151 |
| latitude | 52.5485 | 52.4989 | 52.5432 |
| neighbourhood | Prenzlauer Berg Nordwest | Schöneberg-Nord | Helmholtzplatz |
| neighbourhood_group | Pankow | Tempelhof - Schöneberg | Pankow |
| host_id | 2986 | 4108 | 17391 |
| name | Prenzlauer Berg close to Mauerpark | BerlinSpot Schöneberg near KaDeWe | BrightRoom with sunny greenview! |
| id | 2695 | 3309 | 7071 |
# Trick 10
listings_reverse.name.apply(type).value_counts()
<class 'str'> 22493 <class 'float'> 59 Name: name, dtype: int64
# Trick 11
listings_reverse['price_group'] = pd.cut(listings_reverse.price,
bins=[0, 20, 80, 120],
labels=['cheap', 'normal', 'i am too poor'])
listings_reverse.head().T
| 0 | 1 | 2 | 3 | 4 | |
|---|---|---|---|---|---|
| availability_365 | 141 | 0 | 220 | 297 | 26 |
| calculated_host_listings_count | 4 | 1 | 1 | 1 | 1 |
| reviews_per_month | 3.76 | 1.42 | 1.25 | 0.39 | 1.75 |
| last_review | 2018-10-28 | 2018-10-01 | 2017-03-20 | 2018-08-16 | 2018-11-04 |
| number_of_reviews | 118 | 6 | 143 | 25 | 197 |
| minimum_nights | 4 | 2 | 62 | 5 | 2 |
| price | 60 | 17 | 90 | 26 | 42 |
| room_type | Entire home/apt | Private room | Entire home/apt | Private room | Private room |
| longitude | 13.4026 | 13.4046 | 13.4176 | 13.3491 | 13.4151 |
| latitude | 52.5345 | 52.5485 | 52.535 | 52.4989 | 52.5432 |
| neighbourhood | Brunnenstr. Süd | Prenzlauer Berg Nordwest | Prenzlauer Berg Südwest | Schöneberg-Nord | Helmholtzplatz |
| neighbourhood_group | Mitte | Pankow | Pankow | Tempelhof - Schöneberg | Pankow |
| host_id | 2217 | 2986 | 3718 | 4108 | 17391 |
| name | Berlin-Mitte Value! Quiet courtyard/very central | Prenzlauer Berg close to Mauerpark | Fabulous Flat in great Location | BerlinSpot Schöneberg near KaDeWe | BrightRoom with sunny greenview! |
| id | 2015 | 2695 | 3176 | 3309 | 7071 |
| price_group | normal | cheap | i am too poor | normal | normal |
# Trick 12
tricky_df = pd.DataFrame({'one': ['a', 'b', 'c'], 'two': [[1, 2], [2, 3], [4, 5]]})
tricky_df
| one | two | |
|---|---|---|
| 0 | a | [1, 2] |
| 1 | b | [2, 3] |
| 2 | c | [4, 5] |
tricky_df.two.apply(pd.Series)
| 0 | 1 | |
|---|---|---|
| 0 | 1 | 2 |
| 1 | 2 | 3 |
| 2 | 4 | 5 |
tricky_df.explode('two')
| one | two | |
|---|---|---|
| 0 | a | 1 |
| 0 | a | 2 |
| 1 | b | 2 |
| 1 | b | 3 |
| 2 | c | 4 |
| 2 | c | 5 |
# Trick 13
listings_reverse['price_diff'] = listings_reverse.price.diff()
listings_reverse['price_diff'].sample(3)
7098 -19.0 18851 95.0 12348 -9.0 Name: price_diff, dtype: float64
listings_reverse['price_diff_percent'] = listings_reverse.price.pct_change()
listings_reverse['price_diff_percent'].sample(3)
7742 1.00 5899 -0.52 13966 1.00 Name: price_diff_percent, dtype: float64
# Trick 14
ind_df = listings_df.merge(reviews_df,
on='id',
indicator = True)
ind_df.head(3).T
| 0 | 1 | 2 | |
|---|---|---|---|
| id | 2015 | 2015 | 2015 |
| name | Berlin-Mitte Value! Quiet courtyard/very central | Berlin-Mitte Value! Quiet courtyard/very central | Berlin-Mitte Value! Quiet courtyard/very central |
| host_id | 2217 | 2217 | 2217 |
| neighbourhood_group | Mitte | Mitte | Mitte |
| neighbourhood | Brunnenstr. Süd | Brunnenstr. Süd | Brunnenstr. Süd |
| latitude | 52.5345 | 52.5345 | 52.5345 |
| longitude | 13.4026 | 13.4026 | 13.4026 |
| room_type | Entire home/apt | Entire home/apt | Entire home/apt |
| price | 60 | 60 | 60 |
| minimum_nights | 4 | 4 | 4 |
| number_of_reviews | 118 | 118 | 118 |
| last_review | 2018-10-28 | 2018-10-28 | 2018-10-28 |
| reviews_per_month | 3.76 | 3.76 | 3.76 |
| calculated_host_listings_count | 4 | 4 | 4 |
| availability_365 | 141 | 141 | 141 |
| date | 2016-04-11 00:00:00 | 2016-04-15 00:00:00 | 2016-04-26 00:00:00 |
| _merge | both | both | both |
# Trick 15
grouped = listings_df.groupby('room_type')
grouped.get_group('Entire home/apt').head(3).T
| 0 | 2 | 5 | |
|---|---|---|---|
| id | 2015 | 3176 | 9991 |
| name | Berlin-Mitte Value! Quiet courtyard/very central | Fabulous Flat in great Location | Geourgeous flat - outstanding views |
| host_id | 2217 | 3718 | 33852 |
| neighbourhood_group | Mitte | Pankow | Pankow |
| neighbourhood | Brunnenstr. Süd | Prenzlauer Berg Südwest | Prenzlauer Berg Südwest |
| latitude | 52.5345 | 52.535 | 52.533 |
| longitude | 13.4026 | 13.4176 | 13.416 |
| room_type | Entire home/apt | Entire home/apt | Entire home/apt |
| price | 60 | 90 | 180 |
| minimum_nights | 4 | 62 | 6 |
| number_of_reviews | 118 | 143 | 6 |
| last_review | 2018-10-28 | 2017-03-20 | 2018-07-23 |
| reviews_per_month | 3.76 | 1.25 | 0.15 |
| calculated_host_listings_count | 4 | 1 | 1 |
| availability_365 | 141 | 220 | 137 |